/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
#include <seminix/pagesize.h>
#include <seminix/linkage.h>
#include <seminix/cache.h>
#include <asm/assembler.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest is below src, or the regions do not overlap, call memcpy;
 * otherwise copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
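/*
 * Roughly, the entry code below behaves like this C-level sketch
 * (illustrative only, not the exact instruction sequence):
 *
 *	if (dest < src || dest >= src + n)
 *		return __memcpy(dest, src, n);
 *
 * Otherwise the regions overlap with dest above src, so both pointers
 * are advanced to the end of their buffers and the copy runs backwards.
 */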
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14
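/*
 * The A_l/A_h .. D_l/D_h pairs above hold the four 16-byte chunks of
 * each 64-byte block moved by the bulk-copy code below.
 */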

    .global memmove
ENTRY(__memmove)
ENTRY(memmove)
    cmp	dstin, src
    b.lo	__memcpy
    add	tmp1, src, count
    cmp	dstin, tmp1
    b.hs	__memcpy		/* No overlap.  */

    add	dst, dstin, count
    add	src, src, count
    cmp	count, #16
    b.lo	.Ltail15  /* fewer than 16 bytes: not worth aligning */

    ands	tmp2, src, #15     /* Bytes to reach alignment.  */
    b.eq	.LSrcAligned
    sub	count, count, tmp2
    /*
    * Copy the misaligned bytes first so that src becomes 16-byte
    * aligned. The cost of these extra instructions is acceptable,
    * and it means the subsequent accesses use aligned addresses.
    */
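    /*
    * For example, if the (end) src pointer has low bits 0b1011
    * (tmp2 == 11), the taken steps below copy 1 + 2 + 8 = 11 bytes
    * backwards, after which src is 16-byte aligned.
    */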
    tbz	tmp2, #0, 1f
    ldrb	tmp1w, [src, #-1]!
    strb	tmp1w, [dst, #-1]!
1:
    tbz	tmp2, #1, 2f
    ldrh	tmp1w, [src, #-2]!
    strh	tmp1w, [dst, #-2]!
2:
    tbz	tmp2, #2, 3f
    ldr	tmp1w, [src, #-4]!
    str	tmp1w, [dst, #-4]!
3:
    tbz	tmp2, #3, .LSrcAligned
    ldr	tmp1, [src, #-8]!
    str	tmp1, [dst, #-8]!

.LSrcAligned:
    cmp	count, #64
    b.ge	.Lcpy_over64

    /*
    * Deal with small copies quickly by dropping straight into the
    * exit block.
    */
.Ltail63:
    /*
    * Copy up to 48 bytes of data. At this point we only need the
    * bottom 6 bits of count to be accurate.
    */
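    /*
    * Dispatch on bits [5:4] of count: 0x30 falls through all three
    * ldp/stp pairs (48 bytes), 0x20 branches to 1: (32 bytes), 0x10
    * branches to 2: (16 bytes), and 0 skips straight to .Ltail15.
    */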
    ands	tmp1, count, #0x30
    b.eq	.Ltail15
    cmp	tmp1w, #0x20
    b.eq	1f
    b.lt	2f
    ldp	A_l, A_h, [src, #-16]!
    stp	A_l, A_h, [dst, #-16]!
1:
    ldp	A_l, A_h, [src, #-16]!
    stp	A_l, A_h, [dst, #-16]!
2:
    ldp	A_l, A_h, [src, #-16]!
    stp	A_l, A_h, [dst, #-16]!

.Ltail15:
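    /*
    * Drain the remaining 0-15 bytes using count bits 3..0: 8, 4, 2 and
    * finally 1 byte, all copied backwards.
    */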
    tbz	count, #3, 1f
    ldr	tmp1, [src, #-8]!
    str	tmp1, [dst, #-8]!
1:
    tbz	count, #2, 2f
    ldr	tmp1w, [src, #-4]!
    str	tmp1w, [dst, #-4]!
2:
    tbz	count, #1, 3f
    ldrh	tmp1w, [src, #-2]!
    strh	tmp1w, [dst, #-2]!
3:
    tbz	count, #0, .Lexitfunc
    ldrb	tmp1w, [src, #-1]
    strb	tmp1w, [dst, #-1]

.Lexitfunc:
    ret

.Lcpy_over64:
    subs	count, count, #128
    b.ge	.Lcpy_body_large
    /*
    * Less than 128 bytes to copy, so handle 64 bytes here and then jump
    * to the tail.
    */
    ldp	A_l, A_h, [src, #-16]
    stp	A_l, A_h, [dst, #-16]
    ldp	B_l, B_h, [src, #-32]
    ldp	C_l, C_h, [src, #-48]
    stp	B_l, B_h, [dst, #-32]
    stp	C_l, C_h, [dst, #-48]
    ldp	D_l, D_h, [src, #-64]!
    stp	D_l, D_h, [dst, #-64]!
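
    /*
    * count is now (original count - 128) and therefore negative here,
    * but the tail code only consults its bottom 6 bits, which still
    * equal the number of bytes left after the 64-byte block above.
    */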

    tst	count, #0x3f
    b.ne	.Ltail63
    ret

    /*
    * Critical loop. Start at a new cache line boundary. Assuming
    * 64 bytes per line, this ensures the entire loop fits within a
    * single line.
    */
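    /*
    * count already had 128 subtracted at .Lcpy_over64; the 64-byte
    * pre-load plus the drain stores after the loop account for that
    * bias. For example, a 256-byte copy enters with count == 128, runs
    * the loop three times (count 64, 0, -64), then stores the final
    * pre-loaded block.
    */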
    .p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
    /* pre-load 64 bytes of data. */
    ldp	A_l, A_h, [src, #-16]
    ldp	B_l, B_h, [src, #-32]
    ldp	C_l, C_h, [src, #-48]
    ldp	D_l, D_h, [src, #-64]!
1:
    /*
    * Interleave loading the next 64-byte block with storing the
    * previously loaded 64 bytes of data.
    */
    stp	A_l, A_h, [dst, #-16]
    ldp	A_l, A_h, [src, #-16]
    stp	B_l, B_h, [dst, #-32]
    ldp	B_l, B_h, [src, #-32]
    stp	C_l, C_h, [dst, #-48]
    ldp	C_l, C_h, [src, #-48]
    stp	D_l, D_h, [dst, #-64]!
    ldp	D_l, D_h, [src, #-64]!
    subs	count, count, #64
    b.ge	1b
    stp	A_l, A_h, [dst, #-16]
    stp	B_l, B_h, [dst, #-32]
    stp	C_l, C_h, [dst, #-48]
    stp	D_l, D_h, [dst, #-64]!

    tst	count, #0x3f
    b.ne	.Ltail63
    ret
ENDPIPROC(memmove)
ENDPROC(__memmove)
